// vim: set syntax=asm :

// Scalar fused ops: each include below instantiates "zmm_scalar.tmpliq" once,
// passing the label suffix to emit, the packed single-precision instruction
// to apply, and the zmm register range (from..to) supplied by the including
// kernel.
// scalar_sub_flipped reuses vsubps with flipped: true -- presumably this
// swaps the operand order of the subtraction; confirm in zmm_scalar.tmpliq.
{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to %}
{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to %}
{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to %}
{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", from:from, to:to %}
{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to %}
{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %}

// Leaky ReLU applied in place to every register in the from..to range:
// lanes holding x < 0 become alpha * x, all other lanes are left untouched.
// Input: alpha is a 32-bit float read from [rdi + 8]; the layout of the
// structure rdi points to is defined outside this file.
// Clobbers: zmm14, zmm15 and mask register k1.
{{L}}leaky_relu:
    // scratch registers: original note says only zmm12 to zmm15 are free here
    // NOTE(review): assumes from..to never covers zmm14 or zmm15 -- confirm
    // against the kernels that include this template
    // zmm15 <- alpha broadcast to all 16 f32 lanes
    vbroadcastss    zmm15, dword ptr [rdi + 8]
    // zmm14 <- all zero (comparison threshold)
    vpxorq          zmm14, zmm14, zmm14

    {% for reg in (from..to) %}
        vcmpps      k1, zmm{{reg}}, zmm14, 1 // predicate 1 = LT: k1 bit set where lane < 0 (false for NaN lanes)
        // merge-masked multiply: lanes selected by k1 become alpha * x,
        // unselected lanes keep their original x
        vmulps      zmm{{reg}} {k1}, zmm{{reg}}, zmm15
    {% endfor %}
    // every register in the range now holds alpha * x where x < 0, x elsewhere

    jmp    {{L}}non_linear_loop

// The q_scale, q_shl and q_shr entry points are not implemented here: all
// three labels share the single jump below to the unsupported handler
// (defined outside this file).
{{L}}q_scale:
{{L}}q_shl:
{{L}}q_shr:
    jmp {{L}}unsupported
